import numpy as np
import pandas as pd
# Lady-tasting-tea style permutation test: 20 cups, 10 with milk poured first.
n_cups = 20
n_milk_first = 10
opposite = n_cups - n_milk_first
# 1 = milk poured first, 0 = tea poured first.
P = np.concatenate((np.ones(n_milk_first), np.zeros(opposite)), axis=0)
B = 100000

# Vectorized simulation: each row of `ranks` picks n_milk_first cups at random
# without replacement; the statistic is how many picked cups are truly milk-first.
# This replaces a Python loop of B individual `.loc` assignments, which is
# orders of magnitude slower in pandas.
ranks = np.random.rand(B, n_cups).argsort(axis=1)[:, :n_milk_first]
milk_estimates = pd.DataFrame({"milk_estimate": P[ranks].sum(axis=1)})
milk_estimates.head(10)
milk_estimate
0 6.0
1 6.0
2 4.0
3 5.0
4 6.0
5 5.0
6 4.0
7 5.0
8 4.0
9 5.0
# Histogram of the simulated null distribution of the test statistic.
milk_estimates.plot.hist()
<AxesSubplot:ylabel='Frequency'>

# Observed statistic from the actual experiment: 8 correct milk-first picks.
observed_val = 8
# Count simulated statistics at least as extreme as the observed one;
# `a` is the row count, `b` the column count of the filtered frame.
a, b = milk_estimates.query("milk_estimate >= @observed_val").shape
a
1139
# One-sided p-value: fraction of simulated statistics >= the observed value.
p_value = a/B
p_value
0.01139
# Original Fisher tea-tasting setup: 8 cups, 4 milk-first, all 4 identified.
n_cups = 8
n_milk_first = 4
opposite = n_cups - n_milk_first
P = np.concatenate((np.ones(n_milk_first), np.zeros(opposite)))
observed_val = 4

B = 1000000
# Vectorized permutation simulation: replaces a million-iteration Python loop
# of per-row `.loc` writes, which is extremely slow in pandas.
picks = np.random.rand(B, n_cups).argsort(axis=1)[:, :n_milk_first]
estimates = pd.DataFrame({"milk_estimates": P[picks].sum(axis=1)})

a, b = estimates[estimates["milk_estimates"] >= observed_val].shape
p_value = a / B
print(p_value)
0.014372
B = 100000

# Observed: 620 successes out of 1000 trials; null success probability 0.54.
observed_val = 620
sample = np.random.binomial(1000, 0.54, B)
# Monte-Carlo one-sided p-value: the mean of a boolean array is the
# fraction of samples at least as extreme as the observed value.
p_value = (sample >= observed_val).mean()
p_value
0.0
# IPython-only help syntax: displays the docstring of np.random.binomial.
# (Not valid in a plain .py script; use help(np.random.binomial) there.)
np.random.binomial?
import scipy.stats as stats

# Exact upper-tail probability P(X >= 620) for X ~ Binomial(1000, 0.54).
binomial_dist = stats.binom(1000, 0.54)
# Use the survival function rather than 1 - cdf: sf computes the upper tail
# directly and avoids catastrophic cancellation for very small probabilities.
binomial_dist.sf(619)
1.929786053178617e-07
# Load the 2018 national physical fitness data and keep four columns,
# renaming the coded item columns to readable names.
url = "https://ilovedata.github.io/teaching/bigdata2/data/physical_test_2018_data.csv"
physical_data = pd.read_csv(url)
selected_cols = ["TEST_AGE", "TEST_SEX", "ITEM_F001", "ITEM_F002"]
twogroup = physical_data[selected_cols].rename(
    columns={"ITEM_F001": "height", "ITEM_F002": "weight"}
)
twogroup.head(10)
TEST_AGE TEST_SEX height weight
0 33 M 159.2 57.2
1 48 F 155.8 52.9
2 22 M 175.2 96.2
3 29 M 178.7 79.4
4 31 F 160.1 50.2
5 23 F 157.8 60.1
6 11 M 165.5 60.3
7 24 M 174.9 74.5
8 18 M 181.0 71.3
9 41 F 160.6 72.7
# Randomization: assign each subject independently to treatment A or B.
a, b = twogroup.shape
# Single vectorized draw with size= instead of a Python list comprehension
# of one-element np.random.choice calls (much faster for large n).
treatment = np.random.choice(["A", "B"], size=a)
twogroup["treatment"] = treatment
twogroup.head(10)
TEST_AGE TEST_SEX height weight treatment
0 33 M 159.2 57.2 A
1 48 F 155.8 52.9 B
2 22 M 175.2 96.2 B
3 29 M 178.7 79.4 A
4 31 F 160.1 50.2 B
5 23 F 157.8 60.1 B
6 11 M 165.5 60.3 A
7 24 M 174.9 74.5 B
8 18 M 181.0 71.3 A
9 41 F 160.6 72.7 B
# Measurements for three groups, one column per treatment arm (wide format):
# Placebo, Old drug, New drug.
filename = "https://ilovedata.github.io/teaching/bigdata2/data/drug.csv"
three_drug_wide = pd.read_csv(filename)
three_drug_wide
Placebo Old New
0 31 23 23
1 28 17 17
2 34 29 11
3 36 23 11
4 33 17 9
5 27 17 16
6 39 22 16
7 25 17 14
8 23 23 17
9 29 25 18
10 36 17 10
11 36 24 7
12 32 19 14
13 30 18 24
14 33 5 18
15 39 26 17
16 30 33 14
17 32 16 14
18 30 20 16
19 28 27 11
# Reshape wide -> long: one row per (treatment, Value) observation.
three_drug_long = three_drug_wide.melt(value_vars = ["Placebo","Old","New"],value_name = "Value",var_name = "treatment")

신약그룹이 기존약 그룹보다 효과가 있는가?

  1. 귀무가설 : 효과없다. (두 모집단 평균에 차이가 없다)

\[H_0 : \mu_{old} - \mu_{new} = 0 \]

  2. 데이터 관측 & 검정통계량 설정

\[T = \bar x_{old} - \bar x_{new} \]

def diff_mean(df, treatment):
    """Difference of group means of `Value` between two treatment groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with columns "treatment" and "Value".
    treatment : sequence of str
        Two treatment labels; the statistic is mean(first) - mean(second).

    Returns
    -------
    float
        mean(Value | treatment[0]) - mean(Value | treatment[1]).
    """
    # Keep only the two requested groups; reset_index gives a clean frame.
    mask = (df.treatment == treatment[0]) | (df.treatment == treatment[1])
    twodrug = df.loc[mask].reset_index(drop=True)
    # Aggregate only the "Value" column so non-numeric columns cannot break mean().
    group_mean = twodrug.groupby("treatment")["Value"].mean().reset_index()
    print(group_mean)
    # .iloc[0] extracts the scalar; float(Series) is deprecated in pandas.
    mean_first = group_mean.loc[group_mean.treatment == treatment[0], "Value"].iloc[0]
    mean_second = group_mean.loc[group_mean.treatment == treatment[1], "Value"].iloc[0]
    return float(mean_first - mean_second)
# Observed test statistic: Old-group mean minus New-group mean (~6.05 here).
diff_mean(three_drug_long,["Old","New"])
  treatment  Value
0       New  14.85
1       Old  20.90
6.049999999999999
  3. 귀무가설 하에서의 분포 설정
# Build one draw from the null distribution by permuting treatment labels
# between the Old and New groups.
cond = (three_drug_long.treatment == "Old") | (three_drug_long.treatment == "New")
twodrug = three_drug_long[cond].reset_index(drop=True)
# sample(frac=1.0, replace=False) returns a random permutation of the labels.
random_permuted_treat = np.array(twodrug.treatment.sample(frac=1.0, replace=False))
# Copy explicitly: the original code aliased `twodrug`, so adding the permuted
# column silently mutated `twodrug` as well.
two_drug_permuted = twodrug.copy()
two_drug_permuted["permuted_treat"] = random_permuted_treat
two_drug_permuted
treatment Value permuted_treat
0 Old 23 Old
1 Old 17 New
2 Old 29 Old
3 Old 23 New
4 Old 17 New
5 Old 17 Old
6 Old 22 Old
7 Old 17 Old
8 Old 23 Old
9 Old 25 Old
10 Old 17 New
11 Old 24 Old
12 Old 19 Old
13 Old 18 New
14 Old 5 Old
15 Old 26 New
16 Old 33 New
17 Old 16 New
18 Old 20 New
19 Old 27 New
20 New 23 New
21 New 17 New
22 New 11 New
23 New 11 Old
24 New 9 Old
25 New 16 New
26 New 16 Old
27 New 14 Old
28 New 17 Old
29 New 18 Old
30 New 10 Old
31 New 7 New
32 New 14 New
33 New 24 Old
34 New 18 Old
35 New 17 New
36 New 14 New
37 New 14 New
38 New 16 New
39 New 11 Old